k-Nearest Neighbours (KNN)

In [1]:
import warnings
warnings.filterwarnings('ignore')
In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
In [4]:
df = pd.read_csv(r'C:\Users\..\KNNData.csv')
In [5]:
df
Out[5]:
WTT PTI EQW SBI LQE QWG FDJ PJF HQE NXJ TARGET CLASS
0 0.913917 1.162073 0.567946 0.755464 0.780862 0.352608 0.759697 0.643798 0.879422 1.231409 1
1 0.635632 1.003722 0.535342 0.825645 0.924109 0.648450 0.675334 1.013546 0.621552 1.492702 0
2 0.721360 1.201493 0.921990 0.855595 1.526629 0.720781 1.626351 1.154483 0.957877 1.285597 0
3 1.234204 1.386726 0.653046 0.825624 1.142504 0.875128 1.409708 1.380003 1.522692 1.153093 1
4 1.279491 0.949750 0.627280 0.668976 1.232537 0.703727 1.115596 0.646691 1.463812 1.419167 1
... ... ... ... ... ... ... ... ... ... ... ...
995 1.010953 1.034006 0.853116 0.622460 1.036610 0.586240 0.746811 0.319752 1.117340 1.348517 1
996 0.575529 0.955786 0.941835 0.792882 1.414277 1.269540 1.055928 0.713193 0.958684 1.663489 0
997 1.135470 0.982462 0.781905 0.916738 0.901031 0.884738 0.386802 0.389584 0.919191 1.385504 1
998 1.084894 0.861769 0.407158 0.665696 1.608612 0.943859 0.855806 1.061338 1.277456 1.188063 1
999 0.837460 0.961184 0.417006 0.799784 0.934399 0.424762 0.778234 0.907962 1.257190 1.364837 1

1000 rows × 11 columns

In [6]:
df.describe()
Out[6]:
WTT PTI EQW SBI LQE QWG FDJ PJF HQE NXJ TARGET CLASS
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.00000
mean 0.949682 1.114303 0.834127 0.682099 1.032336 0.943534 0.963422 1.071960 1.158251 1.362725 0.50000
std 0.289635 0.257085 0.291554 0.229645 0.243413 0.256121 0.255118 0.288982 0.293738 0.204225 0.50025
min 0.174412 0.441398 0.170924 0.045027 0.315307 0.262389 0.295228 0.299476 0.365157 0.639693 0.00000
25% 0.742358 0.942071 0.615451 0.515010 0.870855 0.761064 0.784407 0.866306 0.934340 1.222623 0.00000
50% 0.940475 1.118486 0.813264 0.676835 1.035824 0.941502 0.945333 1.065500 1.165556 1.375368 0.50000
75% 1.163295 1.307904 1.028340 0.834317 1.198270 1.123060 1.134852 1.283156 1.383173 1.504832 1.00000
max 1.721779 1.833757 1.722725 1.634884 1.650050 1.666902 1.713342 1.785420 1.885690 1.893950 1.00000
In [7]:
sns.pairplot(df)
Out[7]:
<seaborn.axisgrid.PairGrid at 0x1ce497e37c8>

Data Pre-processing - Feature Scaling

1. Standard Scaler

$$ \frac{x_{i} - mean(x)}{stdev(x)} $$

In [8]:
from sklearn.preprocessing import StandardScaler
In [9]:
ss = StandardScaler()
In [10]:
scaled_features = ss.fit_transform(df.drop('TARGET CLASS', axis = 1))
In [11]:
np.round(pd.DataFrame(scaled_features).describe())
Out[11]:
0 1 2 3 4 5 6 7 8 9
count 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0
mean -0.0 0.0 -0.0 -0.0 -0.0 0.0 0.0 -0.0 0.0 -0.0
std 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
min -3.0 -3.0 -2.0 -3.0 -3.0 -3.0 -3.0 -3.0 -3.0 -4.0
25% -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0
50% -0.0 0.0 -0.0 -0.0 0.0 -0.0 -0.0 -0.0 0.0 0.0
75% 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
max 3.0 3.0 3.0 4.0 3.0 3.0 3.0 2.0 2.0 3.0

2. Min-Max Scaler

$$\dfrac{x_{i} - min(x)}{max(x) - min(x)}$$

In [12]:
from sklearn.preprocessing import MinMaxScaler
In [13]:
mm = MinMaxScaler(feature_range=(0, 1)) # feature_range=(0, 1)
In [14]:
scaled_features_mm = mm.fit_transform(df.drop('TARGET CLASS', axis = 1))
In [15]:
np.round(pd.DataFrame(scaled_features_mm).describe())
Out[15]:
0 1 2 3 4 5 6 7 8 9
count 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0
mean 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 1.0
std 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
min 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
25% 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
50% 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 1.0 1.0
75% 1.0 1.0 1.0 0.0 1.0 1.0 1.0 1.0 1.0 1.0
max 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
In [16]:
pd.DataFrame(scaled_features_mm)
Out[16]:
0 1 2 3 4 5 6 7 8 9
0 0.477912 0.517593 0.255846 0.446856 0.348797 0.064235 0.327526 0.231719 0.338214 0.471767
1 0.298068 0.403864 0.234836 0.490999 0.456119 0.274872 0.268036 0.480550 0.168622 0.680091
2 0.353470 0.545904 0.483996 0.509837 0.907533 0.326371 0.938657 0.575397 0.389811 0.514970
3 0.684900 0.678940 0.310686 0.490986 0.619742 0.436264 0.785889 0.727165 0.761269 0.409326
4 0.714167 0.365101 0.294082 0.392456 0.687196 0.314229 0.578492 0.233666 0.722546 0.621463
... ... ... ... ... ... ... ... ... ... ...
995 0.540622 0.425615 0.439613 0.363198 0.540406 0.230579 0.318439 0.013646 0.494684 0.565135
996 0.259226 0.369436 0.496785 0.470391 0.823357 0.717082 0.536417 0.278421 0.390341 0.816257
997 0.621092 0.388596 0.393724 0.548295 0.438829 0.443107 0.064574 0.060641 0.364369 0.594624
998 0.588408 0.301913 0.152232 0.390393 0.968955 0.485200 0.395298 0.512713 0.599986 0.437207
999 0.428501 0.373313 0.158579 0.474733 0.463829 0.115608 0.340598 0.409495 0.586658 0.578147

1000 rows × 10 columns

3. Robust Scaler

$$ \dfrac{x_{i} - Q_{1}(x)}{Q_{3}(x) - Q_{1}(x)} $$

In [17]:
from sklearn.preprocessing import RobustScaler
In [18]:
rr = RobustScaler()
In [19]:
scaled_features_rr = rr.fit_transform(df.drop('TARGET CLASS', axis = 1))
In [20]:
np.round(pd.DataFrame(scaled_features_rr).describe())
Out[20]:
0 1 2 3 4 5 6 7 8 9
count 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0 1000.0
mean 0.0 -0.0 0.0 0.0 -0.0 0.0 0.0 0.0 -0.0 -0.0
std 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
min -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -2.0 -3.0
25% -0.0 -0.0 -0.0 -1.0 -1.0 -0.0 -0.0 -0.0 -1.0 -1.0
50% -0.0 -0.0 -0.0 0.0 0.0 0.0 0.0 -0.0 0.0 0.0
75% 1.0 1.0 1.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0
max 2.0 2.0 2.0 3.0 2.0 2.0 2.0 2.0 2.0 2.0
In [21]:
pd.DataFrame(scaled_features_mm)
Out[21]:
0 1 2 3 4 5 6 7 8 9
0 0.477912 0.517593 0.255846 0.446856 0.348797 0.064235 0.327526 0.231719 0.338214 0.471767
1 0.298068 0.403864 0.234836 0.490999 0.456119 0.274872 0.268036 0.480550 0.168622 0.680091
2 0.353470 0.545904 0.483996 0.509837 0.907533 0.326371 0.938657 0.575397 0.389811 0.514970
3 0.684900 0.678940 0.310686 0.490986 0.619742 0.436264 0.785889 0.727165 0.761269 0.409326
4 0.714167 0.365101 0.294082 0.392456 0.687196 0.314229 0.578492 0.233666 0.722546 0.621463
... ... ... ... ... ... ... ... ... ... ...
995 0.540622 0.425615 0.439613 0.363198 0.540406 0.230579 0.318439 0.013646 0.494684 0.565135
996 0.259226 0.369436 0.496785 0.470391 0.823357 0.717082 0.536417 0.278421 0.390341 0.816257
997 0.621092 0.388596 0.393724 0.548295 0.438829 0.443107 0.064574 0.060641 0.364369 0.594624
998 0.588408 0.301913 0.152232 0.390393 0.968955 0.485200 0.395298 0.512713 0.599986 0.437207
999 0.428501 0.373313 0.158579 0.474733 0.463829 0.115608 0.340598 0.409495 0.586658 0.578147

1000 rows × 10 columns

In [22]:
sns.pairplot(pd.DataFrame(scaled_features_mm))
Out[22]:
<seaborn.axisgrid.PairGrid at 0x1ce50caa888>

k-NN Model Building

In [23]:
pd.DataFrame(scaled_features)
Out[23]:
0 1 2 3 4 5 6 7 8 9
0 -0.123542 0.185907 -0.913431 0.319629 -1.033637 -2.308375 -0.798951 -1.482368 -0.949719 -0.643314
1 -1.084836 -0.430348 -1.025313 0.625388 -0.444847 -1.152706 -1.129797 -0.202240 -1.828051 0.636759
2 -0.788702 0.339318 0.301511 0.755873 2.031693 -0.870156 2.599818 0.285707 -0.682494 -0.377850
3 0.982841 1.060193 -0.621399 0.625299 0.452820 -0.267220 1.750208 1.066491 1.241325 -1.026987
4 1.139275 -0.640392 -0.709819 -0.057175 0.822886 -0.936773 0.596782 -1.472352 1.040772 0.276510
... ... ... ... ... ... ... ... ... ... ...
995 0.211653 -0.312490 0.065163 -0.259834 0.017567 -1.395721 -0.849486 -2.604264 -0.139347 -0.069602
996 -1.292453 -0.616901 0.369613 0.482648 1.569891 1.273495 0.362784 -1.242110 -0.679746 1.473448
997 0.641777 -0.513083 -0.179205 1.022255 -0.539703 -0.229680 -2.261339 -2.362494 -0.814261 0.111597
998 0.467072 -0.982786 -1.465194 -0.071465 2.368666 0.001269 -0.422041 -0.036777 0.406025 -0.855670
999 -0.387654 -0.595894 -1.431398 0.512722 -0.402552 -2.026512 -0.726253 -0.567789 0.336997 0.010350

1000 rows × 10 columns

In [25]:
df.columns[:-1]
Out[25]:
Index(['WTT', 'PTI', 'EQW', 'SBI', 'LQE', 'QWG', 'FDJ', 'PJF', 'HQE', 'NXJ'], dtype='object')
In [26]:
df_final = pd.DataFrame(scaled_features, columns=df.columns[:-1])
In [27]:
df_final
Out[27]:
WTT PTI EQW SBI LQE QWG FDJ PJF HQE NXJ
0 -0.123542 0.185907 -0.913431 0.319629 -1.033637 -2.308375 -0.798951 -1.482368 -0.949719 -0.643314
1 -1.084836 -0.430348 -1.025313 0.625388 -0.444847 -1.152706 -1.129797 -0.202240 -1.828051 0.636759
2 -0.788702 0.339318 0.301511 0.755873 2.031693 -0.870156 2.599818 0.285707 -0.682494 -0.377850
3 0.982841 1.060193 -0.621399 0.625299 0.452820 -0.267220 1.750208 1.066491 1.241325 -1.026987
4 1.139275 -0.640392 -0.709819 -0.057175 0.822886 -0.936773 0.596782 -1.472352 1.040772 0.276510
... ... ... ... ... ... ... ... ... ... ...
995 0.211653 -0.312490 0.065163 -0.259834 0.017567 -1.395721 -0.849486 -2.604264 -0.139347 -0.069602
996 -1.292453 -0.616901 0.369613 0.482648 1.569891 1.273495 0.362784 -1.242110 -0.679746 1.473448
997 0.641777 -0.513083 -0.179205 1.022255 -0.539703 -0.229680 -2.261339 -2.362494 -0.814261 0.111597
998 0.467072 -0.982786 -1.465194 -0.071465 2.368666 0.001269 -0.422041 -0.036777 0.406025 -0.855670
999 -0.387654 -0.595894 -1.431398 0.512722 -0.402552 -2.026512 -0.726253 -0.567789 0.336997 0.010350

1000 rows × 10 columns

In [29]:
df_final = pd.concat([df_final, df['TARGET CLASS']], axis = 1)
In [30]:
df_final
Out[30]:
WTT PTI EQW SBI LQE QWG FDJ PJF HQE NXJ TARGET CLASS
0 -0.123542 0.185907 -0.913431 0.319629 -1.033637 -2.308375 -0.798951 -1.482368 -0.949719 -0.643314 1
1 -1.084836 -0.430348 -1.025313 0.625388 -0.444847 -1.152706 -1.129797 -0.202240 -1.828051 0.636759 0
2 -0.788702 0.339318 0.301511 0.755873 2.031693 -0.870156 2.599818 0.285707 -0.682494 -0.377850 0
3 0.982841 1.060193 -0.621399 0.625299 0.452820 -0.267220 1.750208 1.066491 1.241325 -1.026987 1
4 1.139275 -0.640392 -0.709819 -0.057175 0.822886 -0.936773 0.596782 -1.472352 1.040772 0.276510 1
... ... ... ... ... ... ... ... ... ... ... ...
995 0.211653 -0.312490 0.065163 -0.259834 0.017567 -1.395721 -0.849486 -2.604264 -0.139347 -0.069602 1
996 -1.292453 -0.616901 0.369613 0.482648 1.569891 1.273495 0.362784 -1.242110 -0.679746 1.473448 0
997 0.641777 -0.513083 -0.179205 1.022255 -0.539703 -0.229680 -2.261339 -2.362494 -0.814261 0.111597 1
998 0.467072 -0.982786 -1.465194 -0.071465 2.368666 0.001269 -0.422041 -0.036777 0.406025 -0.855670 1
999 -0.387654 -0.595894 -1.431398 0.512722 -0.402552 -2.026512 -0.726253 -0.567789 0.336997 0.010350 1

1000 rows × 11 columns

In [31]:
from sklearn.model_selection import train_test_split
In [32]:
X = df_final.drop(['TARGET CLASS'], axis = 1)
Y = df_final['TARGET CLASS']
In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 101)

Build 1NN (where k = 1)

In [34]:
from sklearn.neighbors import KNeighborsClassifier
In [36]:
knn1 = KNeighborsClassifier(n_neighbors=1)
In [37]:
knn1.fit(X_train, y_train)
Out[37]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')
In [39]:
pred_knn1 = knn1.predict(X_test)
In [40]:
from sklearn import metrics
In [41]:
metrics.accuracy_score(y_test, pred_knn1)
Out[41]:
0.9233333333333333

Applying GridSearchCV

In [42]:
from sklearn.model_selection import GridSearchCV
In [43]:
param_grid = {'n_neighbors' : range(3,17, 2), 'p' : [1,2]}
In [61]:
list( range(3,17, 2))
Out[61]:
[3, 5, 7, 9, 11, 13, 15]
In [48]:
knn = KNeighborsClassifier()
In [68]:
knn_gscv = GridSearchCV(knn, param_grid, cv=5,verbose=1)
In [69]:
import time
In [70]:
start_time = time.time()
knn_gscv.fit(X_train, y_train)
print(time.time() - start_time, 'Seconds')
Fitting 5 folds for each of 14 candidates, totalling 70 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed:    0.5s finished
Out[70]:
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': range(3, 17, 2), 'p': [1, 2]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)
0.648594856262207 Seconds
In [72]:
knn_gscv.best_params_
knn_gscv.best_score_
Out[72]:
{'n_neighbors': 9, 'p': 2}
Out[72]:
0.94
In [74]:
pd.DataFrame(knn_gscv.cv_results_)
Out[74]:
mean_fit_time std_fit_time mean_score_time std_score_time param_n_neighbors param_p params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 0.002585 0.000798 0.006585 0.001369 3 1 {'n_neighbors': 3, 'p': 1} 0.943262 0.892857 0.900000 0.935714 0.899281 0.914286 0.020936 14
1 0.002196 0.001170 0.007976 0.002089 3 2 {'n_neighbors': 3, 'p': 2} 0.943262 0.900000 0.907143 0.950000 0.899281 0.920000 0.022075 13
2 0.002791 0.000744 0.007779 0.001323 5 1 {'n_neighbors': 5, 'p': 1} 0.964539 0.900000 0.914286 0.957143 0.906475 0.928571 0.026924 10
3 0.002793 0.000747 0.010172 0.001596 5 2 {'n_neighbors': 5, 'p': 2} 0.964539 0.914286 0.907143 0.957143 0.913669 0.931429 0.024340 9
4 0.002601 0.000794 0.007778 0.001595 7 1 {'n_neighbors': 7, 'p': 1} 0.943262 0.892857 0.892857 0.957143 0.920863 0.921429 0.026033 12
5 0.001995 0.000892 0.005584 0.000488 7 2 {'n_neighbors': 7, 'p': 2} 0.964539 0.928571 0.892857 0.978571 0.913669 0.935714 0.031761 3
6 0.001596 0.000489 0.005386 0.000490 9 1 {'n_neighbors': 9, 'p': 1} 0.957447 0.928571 0.907143 0.964286 0.920863 0.935714 0.021810 3
7 0.002393 0.001016 0.005785 0.000747 9 2 {'n_neighbors': 9, 'p': 2} 0.964539 0.928571 0.914286 0.971429 0.920863 0.940000 0.023451 1
8 0.002194 0.000978 0.005186 0.000398 11 1 {'n_neighbors': 11, 'p': 1} 0.964539 0.928571 0.892857 0.950000 0.928058 0.932857 0.024274 8
9 0.002992 0.001784 0.006183 0.001179 11 2 {'n_neighbors': 11, 'p': 2} 0.964539 0.921429 0.900000 0.964286 0.928058 0.935714 0.025262 3
10 0.001794 0.000398 0.005387 0.000801 13 1 {'n_neighbors': 13, 'p': 1} 0.957447 0.935714 0.892857 0.957143 0.928058 0.934286 0.023747 6
11 0.001996 0.000632 0.006640 0.001512 13 2 {'n_neighbors': 13, 'p': 2} 0.964539 0.921429 0.885714 0.964286 0.935252 0.934286 0.029486 6
12 0.001589 0.000486 0.005598 0.000796 15 1 {'n_neighbors': 15, 'p': 1} 0.957447 0.928571 0.885714 0.957143 0.913669 0.928571 0.027252 10
13 0.001787 0.000396 0.006781 0.001579 15 2 {'n_neighbors': 15, 'p': 2} 0.971631 0.921429 0.900000 0.957143 0.935252 0.937143 0.025409 2
In [76]:
# Plot a graph to observe error rate

error_rate = []


for i in  range(3,17, 2):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred = knn.predict(X_test)
    error_rate.append(np.mean(pred != y_test))
Out[76]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
Out[76]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Out[76]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')
Out[76]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')
Out[76]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')
Out[76]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=13, p=2,
                     weights='uniform')
Out[76]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=15, p=2,
                     weights='uniform')
In [77]:
error_rate
Out[77]:
[0.05,
 0.056666666666666664,
 0.06666666666666667,
 0.05,
 0.05333333333333334,
 0.05,
 0.056666666666666664]
In [88]:
plt.figure(dpi = 125)
sns.lineplot(x =range(3,17, 2), y = error_rate,marker = 'o' , color = 'red');
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: